// Avisynth+
// https://avs-plus.net
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .
//
// Linking Avisynth statically or dynamically with other modules is making a
// combined work based on Avisynth.  Thus, the terms and conditions of the GNU
// General Public License cover the whole combination.
//
// As a special exception, the copyright holders of Avisynth give you
// permission to link Avisynth with independent modules that communicate with
// Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
// terms of these independent modules, and to copy and distribute the
// resulting combined work under terms of your choice, provided that
// every copy of the combined work is accompanied by a complete copy of
// the source code of Avisynth (the version of Avisynth used to produce the
// combined work), being distributed under the terms of the GNU General
// Public License plus this exception.  An independent module is a module
// which is not derived from or based on Avisynth, such as 3rd-party filters,
// import and export plugins, or graphical user interfaces.

// pinterf:
// high bit depth, planar RGB
// utf8 option, internally unicode
// info_h font definition reorganized, Latin-1 Supplement 00A0-00FF
// Configurable color
// Configurable halocolor (text outline)
// Configurable background fading
// Alignment
// multiline
// multiple size, multiple fonts, "Terminus", "info_h"
// chroma location, overlay-like weighted chroma handling

#include "info.h"
#include <cstring>
#include <sstream>
#include <fstream>
#include <unordered_map>
#include <array>
#include <iomanip>
#include <avs/filesystem.h>

#include <locale>
#include <cstdio>
#include <cassert>
#include "fonts/fixedfonts.h"
#include "strings.h"
#include "../convert/convert_helper.h"

// Helper for remapping an UTF-8 string to a list of font table indexes.
//
// The input is walked one UTF-8 sequence at a time; each sequence is looked up
// in charReMapUtf8 and the matching font index is stored. Sequences that are
// not present in the table are mapped to 0 (empty neutral character, space).
// Bytes that cannot start a sequence are skipped without producing an entry.
//
// The result is pre-sized with str_utf8_size(s_utf8) (presumably the number of
// characters in the string - confirm in strings.h), so entries that were never
// reached stay 0.
std::vector<int> BitmapFont::remap(const std::string& s_utf8)
{
  // new vector with characters remapped to font table indexes
  std::vector<int> s_remapped;
  const size_t real_len = str_utf8_size(s_utf8);
  s_remapped.resize(real_len);

  size_t index = 0;
  const char* p = s_utf8.data();
  const char* const end = p + s_utf8.size();
  while (p < end) {
    // Lead byte of the current UTF-8 character decides the sequence length
    const unsigned char lb = static_cast<unsigned char>(*p);
    size_t n = 0;
    if ((lb & 0x80) == 0) n = 1; // 0xxxxxxx
    else if ((lb & 0xE0) == 0xC0) n = 2; // 110xxxxx
    else if ((lb & 0xF0) == 0xE0) n = 3; // 1110xxxx
    else if ((lb & 0xF8) == 0xF0) n = 4; // 11110xxx
    else {
      // Invalid lead byte, skip
      ++p;
      continue;
    }

    // A sequence cut off by the end of the string must not be read past
    // the end of the buffer; the shortened key simply will not be found.
    const size_t remaining = static_cast<size_t>(end - p);
    if (n > remaining)
      n = remaining;

    // Zero terminated copy of the UTF-8 sequence, used as the lookup key
    char utf8_char_buf[5] = { 0 };
    std::memcpy(utf8_char_buf, p, n);

    // finds by utf8 character sequence in font index table
    const auto it = charReMapUtf8.find(utf8_char_buf);
    const int font_index = (it != charReMapUtf8.end()) ? it->second : 0; // 0: space

    // Never write outside the vector, even if the pre-computed length and
    // this decoder disagree about the number of characters.
    if (index < s_remapped.size())
      s_remapped[index] = font_index;
    else
      s_remapped.push_back(font_index);
    ++index;

    // Advance the pointer by the number of bytes consumed
    p += n;
  }
  return s_remapped;
}

// Internal function! For creating source code from a previously LoadBDF'd font file
// see fixedfonts.cpp
void BitmapFont::SaveAsC(const uint16_t* _codepoints)
{
  if (font_filename == "") return; // no GUS no sound :)

  std::string fontname;
  if (font_filename.substr(0, 4) == "ter-")
    font_name = "Terminus";
  else
    font_name = font_filename;

  std::ostringstream ss;
  ss << "namespace fixed_font_# {" << std::endl;
  ss << "// -- start of autogenerated text ---" << std::endl;
  ss << "// definition section for font: " << font_filename << std::endl;
  ss << "constexpr int CHARCOUNT = " << std::to_string(number_of_chars) << ";" << std::endl;
  ss << "constexpr int WIDTH = " << std::to_string(width) << ";" << std::endl;
  ss << "constexpr int HEIGHT = " << std::to_string(height) << ";" << std::endl;
  ss << "constexpr FixedFont_info_t fixedfont_info = {" << std::endl;
  ss << "  \"" << font_filename << "\", // font name" << std::endl;
  ss << "  \"" << font_filename << "\", // font name internal" << std::endl;
  ss << "  CHARCOUNT, // num of chars" << std::endl;
  ss << "  WIDTH," << std::endl;
  ss << "  HEIGHT," << std::endl;
  ss << "  " << (bold ? "true" : "false") << " // bold" << std::endl;
  ss << "};" << std::endl;
  ss << "// font bitmap definitions" << std::endl;
  ss << "constexpr std::array<uint16_t, CHARCOUNT * HEIGHT> fixedfont_bitmap = {" << std::endl;
  for (int charcount = 0; charcount < number_of_chars; charcount++)
  {
    constexpr int LINES_BY_N = 16; // maximum number of constants per line
    for (int y = 0; y < height; y++) {
      ss << "0x";
      for (int x = 0; x < fontline_bytes; x++) {
        uint8_t charline = font_bitmaps[(charcount * height + y) * fontline_bytes + x];
        ss << std::setfill('0') << std::setw(2) << std::hex << charline;
      }
      const bool last = charcount == number_of_chars - 1 && y == height - 1;
      if (!last) ss << ",";
      if (y == height - 1)
        ss << " // u" << std::setw(4) << std::hex << _codepoints[charcount];
      if (y % LINES_BY_N == LINES_BY_N - 1) ss << std::endl; // last line of character- forced new line
    }
    if (height % LINES_BY_N != 0)
      ss << std::endl;
  };
  // example output:
  // 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
  // 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, // u0020
  // 0x0000,0x0000,0x0000,0x0000,0x0300,0x0300,0x0300,0x0300,0x0300,0x0300,0x0300,0x0300,0x0300,0x0300,0x0300,0x0300,
  // 0x0000,0x0000,0x0300,0x0300,0x0300,0x0300,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, // u0021

  ss << "};" << std::endl;
  ss << "// codepoints array" << std::endl;
  ss << "constexpr std::array<uint16_t, CHARCOUNT> fixedfont_codepoints = {" << std::endl;
  for (int charcount = 0; charcount < number_of_chars; charcount++)
  {
    constexpr int LINES_BY_N = 16;
    int val = _codepoints[charcount];
    ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << val;
    const bool last = charcount == number_of_chars - 1;
    if (!last) ss << ",";
    if (charcount % LINES_BY_N == LINES_BY_N - 1) ss << std::endl;
  }
  ss << "};" << std::endl;

  ss << "} // namespace" << std::endl;
  ss << "// -- end of autogenerated text ---" << std::endl;
  //
  std::ofstream outFile(font_filename + ".cpp_sample");
  outFile << ss.str();
  outFile.close();

  /* usage:
  // Predefined fonts main table
  constexpr int PREDEFINED_FONT_COUNT = 2;
  static const uint16_t* font_bitmaps[PREDEFINED_FONT_COUNT] =
  {
    &fixed_font_1::fixedfont_bitmap[0],
    &fixed_font_2::fixedfont_bitmap[0]
    // ...
  };
  static const uint16_t* font_codepoints[PREDEFINED_FONT_COUNT] =
  {
    &fixed_font_1::fixedfont_codepoints[0],
    &fixed_font_2::fixedfont_codepoints[0]
    // ...
  };
  static const FixedFont_info_t* font_infos[PREDEFINED_FONT_COUNT] =
  {
    &fixed_font_1::fixedfont_info,
    &fixed_font_2::fixedfont_info
    // ...
  };
  */
}

// Data of a single glyph, collected between STARTCHAR and ENDCHAR of a BDF file.
// All numeric members start at zero so a default-constructed record never
// carries indeterminate values.
typedef struct CharInfo { // STARTCHAR charname
  std::string friendly_name; // text following STARTCHAR
  uint16_t encoding = 0; // value of ENCODING
  int offset_x = 0, offset_y = 0; // values of DWIDTH
  // DWIDTH not supported, same as main PIXELSIZE
  // font properties can be overridden.
  int font_bounding_box_x = 0; // BBX width
  int font_bounding_box_y = 0; // BBX height
  int font_bounding_box_bottomleft_x = 0; // BBX lower left corner, x offset
  int font_bounding_box_bottomleft_y = 0; // BBX lower left corner, y offset
  int empty_lines_bottom = 0; // blank lines below the glyph rows inside the font box
  int empty_lines_top = 0; // blank lines above the glyph rows inside the font box
  int bits_to_shift = 0; // horizontal displacement; negative: shift left, positive: shift right
} CharDef;

// Values taken from the STARTPROPERTIES ... ENDPROPERTIES section of a BDF file.
// Numeric members start at zero: a property that is missing from the file
// leaves a defined value behind instead of an indeterminate one.
typedef struct FontProperties {
  std::string Copyright; // COPYRIGHT
  std::string Notice; // NOTICE
  std::string Family_name; // FAMILY_NAME
  std::string Weight_name; // WEIGHT_NAME, e.g. "Medium", "Bold"
  int pixel_size = 0; // PIXEL_SIZE
  int font_ascent = 0; // FONT_ASCENT, e.g. 12
  int font_descent = 0; // FONT_DESCENT, e.g. 4
  uint16_t default_char = 0; // DEFAULT_CHAR, e.g. 65533
} FontProperties;

// Global header values of a BDF font (FONT, SIZE, FONTBOUNDINGBOX, CHARS).
// Numeric members start at zero so that a font whose file is missing or
// incomplete reports zero glyphs and a zero-sized cell instead of
// indeterminate values.
typedef struct FontInfo {
  std::string font; // n/a
  int size_points = 0, size_dpi_x = 0, size_dpi_y = 0; // n/a
  // these may be overridden by individual fonts.
  int font_bounding_box_x = 0;
  int font_bounding_box_y = 0;
  int font_bounding_box_bottomleft_x = 0;
  int font_bounding_box_bottomleft_y = 0; // -4 e.g. baseline
  int chars = 0; // number of characters
  int fontline_bytes = 0; // length of a single font line in the byte array
} FontInfo;

// Result of loading a BDF font file: header data, properties and the glyph
// tables. The three per-glyph containers are filled in file order by LoadBMF.
class BdfFont {
public:
  // file name part (without directory) of the loaded font file
  std::string font_filename;
  // global header values (bounding box, glyph count, bytes per glyph line)
  FontInfo font_info;
  // values of the STARTPROPERTIES section
  FontProperties font_properties;
  // ENCODING value of each glyph
  std::vector<uint16_t> codepoints_array;
  // STARTCHAR name of each glyph
  std::vector<std::string> charnames_array;
  // one character line is not limited to 16/32 pixels anymore
  // glyph rows stored back to back, font_info.fontline_bytes bytes per row
  std::vector<uint8_t> font_bitmaps;
};

// Removes one pair of enclosing double quotes from s.
// Strings that are shorter than two characters, or that do not both start
// and end with '"', are returned unchanged.
std::string UnQuote(std::string s) {
  const bool is_quoted = s.size() >= 2 && s.front() == '"' && s.back() == '"';
  if (!is_quoted)
    return s;
  // drop first and last character
  return std::string(s, 1, s.size() - 2);
}

// Shifts the bit string stored in buf[0..size) to the left by 'bits' bits.
// Byte 0 holds the most significant bits; vacated bits are filled with zero.
// Shifting by at least size*8 bits clears the whole buffer.
static void vector_shl(uint8_t* buf, const size_t size, const size_t bits) {
  const size_t byte_shift = bits / 8;
  const unsigned bit_shift = static_cast<unsigned>(bits % 8);

  // everything is shifted out
  if (byte_shift >= size) {
    std::memset(buf, 0, size);
    return;
  }

  // whole bytes first: move the tail to the front, clear what is left behind
  if (byte_shift != 0) {
    std::memmove(buf, buf + byte_shift, size - byte_shift);
    std::memset(buf + (size - byte_shift), 0, byte_shift);
  }

  if (bit_shift == 0)
    return;

  // remaining 1..7 bits: every byte takes its top bits from the right neighbour
  const unsigned carry_shift = 8 - bit_shift;
  uint8_t* const last = buf + (size - byte_shift - 1);
  for (uint8_t* cur = buf; cur != last; ++cur)
    *cur = static_cast<uint8_t>((cur[0] << bit_shift) | (cur[1] >> carry_shift));
  *last = static_cast<uint8_t>(*last << bit_shift);
}

// Shifts the bit string stored in buf[0..size) to the right by 'bits' bits.
// Byte 0 holds the most significant bits; vacated bits are filled with zero.
// Shifting by at least size*8 bits clears the whole buffer.
static void vector_shr(uint8_t* buf, const size_t size, const size_t bits) {
  const size_t byte_shift = bits / 8;
  const unsigned bit_shift = static_cast<unsigned>(bits % 8);

  // everything is shifted out
  if (byte_shift >= size) {
    std::memset(buf, 0, size);
    return;
  }

  // whole bytes first: move the head towards the end, clear the front
  if (byte_shift != 0) {
    std::memmove(buf + byte_shift, buf, size - byte_shift);
    std::memset(buf, 0, byte_shift);
  }

  if (bit_shift == 0)
    return;

  // remaining 1..7 bits: walk from the last byte down to the first populated
  // one, every byte takes its top bits from the left neighbour
  const unsigned carry_shift = 8 - bit_shift;
  for (size_t pos = size - 1; pos > byte_shift; --pos)
    buf[pos] = static_cast<uint8_t>((buf[pos] >> bit_shift) | (buf[pos - 1] << carry_shift));
  buf[byte_shift] = static_cast<uint8_t>(buf[byte_shift] >> bit_shift);
}

// Loads a fixed-size bitmap font from a BDF text file.
// https://en.wikipedia.org/wiki/Glyph_Bitmap_Distribution_Format
// also for make cpp source code, see BdfFont methods
//
// name: path of the font file. If it does not exist, an empty font is returned.
// bold: not used by the loader.
// Returns the header, the properties, and for each glyph its name, codepoint
// and bitmap. Every glyph occupies font_bounding_box_y rows of fontline_bytes
// bytes in font_bitmaps, positioned by its index in the file.
//
// Lines are processed by a small state machine:
//   ls_Init            -> waiting for STARTFONT
//   ls_StartFont       -> global header entries, STARTPROPERTIES, CHARS, STARTCHAR
//   ls_StartProperties -> property entries until ENDPROPERTIES
//   ls_Char            -> glyph entries until ENDCHAR
// Unknown entries are ignored. Numbers are converted with std::stoi/std::stoul,
// malformed numbers therefore raise the exceptions of those functions.
static BdfFont LoadBMF(std::string name, bool bold) {
  (void)bold; // the loader itself has no use for it

  enum loadstate { ls_Init, ls_StartFont, ls_StartProperties, ls_Char };

  // value-initialized: all numeric members start at zero
  BdfFont fnt{};
  CharInfo current_char{};

  // explicite font file name
  const auto fname = fs::path(name);

  if (!fs::exists(name))
    return fnt;

  fnt.font_filename = fname.filename().generic_string();
  std::ifstream ss;
  ss.open(name);

  // A file with CR+LF line ends read without text-mode translation leaves a
  // CR at the end of each line, which would spoil keywords and hex data.
  const auto chomp_cr = [](std::string& line) {
    if (!line.empty() && line.back() == '\r')
      line.pop_back();
  };
  // next space separated token of the line as integer
  const auto next_int = [](std::istringstream& in) {
    std::string tok;
    std::getline(in, tok, ' ');
    return std::stoi(tok);
  };
  // Remainder of the line, blanks trimmed, enclosing quotes removed. String
  // properties may contain spaces, e.g. COPYRIGHT "Copyright (C) 2019 ...".
  const auto rest_unquoted = [](std::istringstream& in) {
    std::string rest;
    std::getline(in, rest);
    const size_t first = rest.find_first_not_of(' ');
    if (first == std::string::npos)
      return std::string();
    const size_t last = rest.find_last_not_of(' ');
    return UnQuote(rest.substr(first, last - first + 1));
  };
  // negative dimensions count as zero
  const auto non_negative = [](int v) {
    return v > 0 ? static_cast<size_t>(v) : static_cast<size_t>(0);
  };

  loadstate state = ls_Init;

  size_t char_counter = 0; // index of the next glyph
  size_t write_pos = 0;    // next byte to write in font_bitmaps
  size_t write_end = 0;    // end of the cell of the glyph being written

  // Stores one bitmap byte; bytes that do not fit into the cell of the
  // current glyph are dropped instead of overwriting foreign memory.
  const auto put_byte = [&](uint8_t value) {
    if (write_pos < write_end)
      fnt.font_bitmaps[write_pos++] = value;
  };

  std::string temp;
  // make list governed by LF separator
  while (std::getline(ss, temp, '\n')) {
    chomp_cr(temp);

    std::string token;
    std::istringstream ssline(temp);
    std::getline(ssline, token, ' ');

    if (token.empty()) continue;

    switch (state) {
    case ls_Init:
      if (token == "STARTFONT")
        state = ls_StartFont;
      // anything else: unexpected token, ignored
      break;

    case ls_StartFont:
      if (token == "ENDFONT") {
        state = ls_Init;
      }
      else if (token == "FONT") {
        // FONT -xos4-Terminus-Bold-R-Normal--16-160-72-72-C-80-ISO10646-1
        std::getline(ssline, token, ' ');
        fnt.font_info.font = UnQuote(token);
      }
      else if (token == "SIZE") {
        // SIZE 16 72 72
        // size in points, X and Y-axis resolution
        fnt.font_info.size_points = next_int(ssline);
        fnt.font_info.size_dpi_x = next_int(ssline);
        fnt.font_info.size_dpi_y = next_int(ssline);
      }
      else if (token == "FONTBOUNDINGBOX") {
        // FONTBOUNDINGBOX 8 16 0 -4
        // bounding box of 8 pixels wide and 16 pixels high
        // lower left hand corner starting at x=0, y=-4. Note that although the bounding box is defined to be a 8x16 cell,
        // this can be overridden for individual glyphs.
        fnt.font_info.font_bounding_box_x = next_int(ssline);
        fnt.font_info.font_bounding_box_y = next_int(ssline);
        fnt.font_info.font_bounding_box_bottomleft_x = next_int(ssline);
        fnt.font_info.font_bounding_box_bottomleft_y = next_int(ssline);

        // a single font line can occupy variable amount of bytes depending on font width
        // Note: in this implementation these global font dimensions are _the_
        // dimensions of the fixed size font
        fnt.font_info.fontline_bytes = (fnt.font_info.font_bounding_box_x + 7) / 8;
      }
      else if (token == "STARTPROPERTIES") {
        // STARTPROPERTIES 20
        // do not check line count now
        state = ls_StartProperties;
      }
      else if (token == "CHARS") {
        std::getline(ssline, token);
        fnt.font_info.chars = std::stoi(token);
        // allocate area for "character_count" * "height" lines
        if (fnt.font_info.chars > 0) {
          const size_t glyphs = non_negative(fnt.font_info.chars);
          const size_t rows = non_negative(fnt.font_info.font_bounding_box_y);
          const size_t row_bytes = non_negative(fnt.font_info.fontline_bytes);
          fnt.font_bitmaps.resize(glyphs * rows * row_bytes);
          fnt.charnames_array.resize(glyphs);
          fnt.codepoints_array.resize(glyphs);
        }
      }
      else if (token == "STARTCHAR") {
        // a new glyph starts with the global font box as default
        std::getline(ssline, token);
        current_char.friendly_name = token;
        current_char.encoding = 0;
        current_char.offset_x = 0;
        current_char.offset_y = 0;
        current_char.font_bounding_box_x = fnt.font_info.font_bounding_box_x;
        current_char.font_bounding_box_y = fnt.font_info.font_bounding_box_y;
        current_char.font_bounding_box_bottomleft_x = fnt.font_info.font_bounding_box_bottomleft_x;
        current_char.font_bounding_box_bottomleft_y = fnt.font_info.font_bounding_box_bottomleft_y;
        current_char.bits_to_shift = 0;
        current_char.empty_lines_bottom = 0;
        current_char.empty_lines_top = 0;

        state = ls_Char;
      }
      // anything else: unexpected token, ignored
      break;

    case ls_StartProperties:
      /*
      FAMILY_NAME "Terminus"
        FOUNDRY "xos4"
        SETWIDTH_NAME "Normal"
        ADD_STYLE_NAME ""
        COPYRIGHT "Copyright (C) 2019 Dimitar Toshkov Zhekov"
        NOTICE "Licensed under the SIL Open Font License, Version 1.1"
        WEIGHT_NAME "Bold"
        SLANT "R"
        PIXEL_SIZE 16
        POINT_SIZE 160
        RESOLUTION_X 72
        RESOLUTION_Y 72
        SPACING "C"
        AVERAGE_WIDTH 80
        CHARSET_REGISTRY "ISO10646"
        CHARSET_ENCODING "1"
        MIN_SPACE 8
        */
      if (token == "ENDPROPERTIES") {
        state = ls_StartFont;
      }
      else if (token == "FAMILY_NAME") {
        //FAMILY_NAME "Terminus"
        fnt.font_properties.Family_name = rest_unquoted(ssline);
      }
      else if (token == "COPYRIGHT") {
        fnt.font_properties.Copyright = rest_unquoted(ssline);
      }
      else if (token == "NOTICE") {
        fnt.font_properties.Notice = rest_unquoted(ssline);
      }
      else if (token == "WEIGHT_NAME") {
        // "Medium", "Bold"
        fnt.font_properties.Weight_name = rest_unquoted(ssline);
      }
      else if (token == "PIXEL_SIZE") {
        fnt.font_properties.pixel_size = next_int(ssline);
      }
      else if (token == "FONT_ASCENT") {
        // FONT_ASCENT 12
        // 12 of the 16 pixels in height are above the baseline.
        fnt.font_properties.font_ascent = next_int(ssline);
      }
      else if (token == "FONT_DESCENT") {
        // FONT_DESCENT 4
        // 4 of the 16 pixels in height are below the baseline
        fnt.font_properties.font_descent = next_int(ssline);
      }
      else if (token == "DEFAULT_CHAR") {
        // DEFAULT_CHAR 65533
        fnt.font_properties.default_char = static_cast<uint16_t>(next_int(ssline));
      }
      break;

    case ls_Char:
      if (token == "ENDCHAR") {
        state = ls_StartFont;
      }
      else if (token == "DWIDTH") {
        // DWIDTH 9 0
        // declares the Device Width of a glyph. After the glyph is rendered, the start of the next glyph is
        // offset 9 pixels on the X-axis and
        // offset 0 pixels on the Y-axis from the current glyph origin.
        // They are not necessarily equal to the width of the glyph.
        // It is simply the offset on the X-axis to move the current point to the start of the next glyph.
        // not used in Avisynth, fonts are of fixed width
        current_char.offset_x = next_int(ssline);
        current_char.offset_y = next_int(ssline);
      }
      else if (token == "ENCODING") {
        // ENCODING 32
        current_char.encoding = static_cast<uint16_t>(next_int(ssline));
      }
      else if (token == "BBX") {
        // BBX 8 16 0 -4 bounding box. 8 pixels wide and 16 pixels tall;
        // lower left corner is offset by 0 on the X and -4 pixels on the Y axis.
        current_char.font_bounding_box_x = next_int(ssline);
        current_char.font_bounding_box_y = next_int(ssline);
        current_char.font_bounding_box_bottomleft_x = next_int(ssline);
        current_char.font_bounding_box_bottomleft_y = next_int(ssline);
        // position of the glyph box inside the global font box
        current_char.empty_lines_bottom = current_char.font_bounding_box_bottomleft_y - fnt.font_info.font_bounding_box_bottomleft_y;
        current_char.empty_lines_top = fnt.font_info.font_bounding_box_y - (current_char.empty_lines_bottom + current_char.font_bounding_box_y);
        current_char.bits_to_shift = current_char.font_bounding_box_bottomleft_x - fnt.font_info.font_bounding_box_bottomleft_x;
      }
      else if (token == "BITMAP") {
        /* BITMAP is followed by one line of hex digits per glyph row,
           two digits per byte, e.g.

           space (no rows at all):      6x12:        10x18:
             BBX 0 0 0 0                  BITMAP       BITMAP
             BITMAP                       00           0000
             ENDCHAR                      70           3F00
                                          88           6180
                                          ...          ...
                                          ENDCHAR      ENDCHAR
        */
        // More glyphs than announced by CHARS (or no CHARS at all): there is
        // no room for this one. Its hex lines match no keyword and are
        // skipped by the outer loop until ENDCHAR.
        if (char_counter >= fnt.codepoints_array.size() || char_counter >= fnt.charnames_array.size())
          continue;

        fnt.codepoints_array[char_counter] = current_char.encoding;
        fnt.charnames_array[char_counter] = current_char.friendly_name;

        // Each glyph owns a fixed cell; starting at the cell keeps the
        // following glyphs in place even if this one has an odd BBX.
        const size_t linebuf_len = non_negative(fnt.font_info.fontline_bytes);
        const size_t glyph_bytes = non_negative(fnt.font_info.font_bounding_box_y) * linebuf_len;
        write_pos = char_counter * glyph_bytes;
        write_end = write_pos + glyph_bytes;
        if (write_end > fnt.font_bitmaps.size())
          write_end = fnt.font_bitmaps.size();
        char_counter++;

        // by spec: charlines are left aligned within byte boundaries

        // fill empty top lines, fontline_bytes bytes per line
        for (int count = 0; count < current_char.empty_lines_top; count++) {
          for (size_t i = 0; i < linebuf_len; i++)
            put_byte(0);
        }

        // one character line is of (almost) arbitrary length, not limited to 16 or 32 bits
        std::vector<uint8_t> charline_buffer(linebuf_len);

        for (int count = 0; count < current_char.font_bounding_box_y; count++)
        {
          std::getline(ss, temp);
          chomp_cr(temp);

          // One byte is defined by two hexadecimal characters.
          // FIXME: an odd length or more bytes than the font width needs
          // should give an error; until then the excess is ignored.
          const size_t len = temp.length();
          size_t buf_ctr = 0;
          for (size_t i = 0; i < len && buf_ctr < linebuf_len; i += 2) {
            const auto hex_pair = temp.substr(i, 2); // next two hex chars
            charline_buffer[buf_ctr++] = (uint8_t)std::stoul(hex_pair, nullptr, 16);
          }
          // bytes not given in the file are blank
          for (size_t i = buf_ctr; i < linebuf_len; i++)
            charline_buffer[i] = 0;

          // shift full buffer, they are msb...lsb for increasing addresses
          if (current_char.bits_to_shift < 0) {
            const int bits_to_shift_left = -current_char.bits_to_shift;
            vector_shl(charline_buffer.data(), linebuf_len, bits_to_shift_left);
          }
          else if (current_char.bits_to_shift > 0) {
            const int bits_to_shift_right = current_char.bits_to_shift;
            vector_shr(charline_buffer.data(), linebuf_len, bits_to_shift_right);
          }

          for (size_t i = 0; i < linebuf_len; i++)
            put_byte(charline_buffer[i]);
        }

        // fill empty bottom lines, fontline_bytes bytes per line
        for (int count = 0; count < current_char.empty_lines_bottom; count++) {
          for (size_t i = 0; i < linebuf_len; i++)
            put_byte(0);
        }
      }
      break;
    }
  }

  return fnt;
}
  /*

  STARTFONT 2.1
    FONT -xos4-Terminus-Bold-R-Normal--16-160-72-72-C-80-ISO10646-1
    SIZE 16 72 72
    FONTBOUNDINGBOX 8 16 0 -4

    STARTPROPERTIES 20
      FAMILY_NAME "Terminus"
      FOUNDRY "xos4"
      SETWIDTH_NAME "Normal"
      ADD_STYLE_NAME ""
      COPYRIGHT "Copyright (C) 2019 Dimitar Toshkov Zhekov"
      NOTICE "Licensed under the SIL Open Font License, Version 1.1"
      WEIGHT_NAME "Bold"
      SLANT "R"
      PIXEL_SIZE 16
      POINT_SIZE 160
      RESOLUTION_X 72
      RESOLUTION_Y 72
      SPACING "C"
      AVERAGE_WIDTH 80
      CHARSET_REGISTRY "ISO10646"
      CHARSET_ENCODING "1"
      MIN_SPACE 8
      FONT_ASCENT 12
      FONT_DESCENT 4
      DEFAULT_CHAR 65533
    ENDPROPERTIES

    CHARS 1354

    STARTCHAR space
      ENCODING 32
      SWIDTH 500 0
      DWIDTH 8 0
      BBX 8 16 0 -4
      BITMAP
      ...
      00
    ENDCHAR

  ENDFONT
  */

// Alignment flags: one horizontal and one vertical flag are combined
static constexpr int ATA_LEFT = 1;
static constexpr int ATA_RIGHT = 2;
static constexpr int ATA_CENTER = 4;

static constexpr int ATA_TOP = 8;
static constexpr int ATA_BOTTOM = 16;
static constexpr int ATA_BASELINE = 32;

// Converts an alignment given as digit position on the numeric keypad (1-9)
// into a combination of ATA_* flags.
// This spec where [X, Y] is relative to the text (inverted logic):
//   7 8 9    `---- --`-- ----`   top
//   4 5 6    .____ __.__ ____.   baseline
//   1 2 3    .---- --.-- ----.   bottom
// Any other value is treated like 4 (baseline, left).
static int alignToBitmask(int align_1_to_9)
{
  if (align_1_to_9 < 1 || align_1_to_9 > 9)
    return ATA_BASELINE | ATA_LEFT;

  // keypad row selects the vertical flag, keypad column the horizontal one
  constexpr int vertical[3] = { ATA_BOTTOM, ATA_BASELINE, ATA_TOP };
  constexpr int horizontal[3] = { ATA_LEFT, ATA_CENTER, ATA_RIGHT };

  const int keypad_row = (align_1_to_9 - 1) / 3;
  const int keypad_column = (align_1_to_9 - 1) % 3;
  return vertical[keypad_row] | horizontal[keypad_column];
}

// Extracts the 8 bit component belonging to 'plane' from a packed color.
// Bits 24-31 are used for the alpha plane, bits 16-23 for R or Y,
// bits 8-15 for G or U and bits 0-7 for B or V.
// An unknown plane id yields the lowest 8 bits.
static int getColorForPlane(int plane, int color)
{
  int shift;
  switch (plane) {
  case PLANAR_A:
    shift = 24;
    break;
  case PLANAR_R:
  case PLANAR_Y:
    shift = 16;
    break;
  case PLANAR_G:
  case PLANAR_U:
    shift = 8;
    break;
  case PLANAR_B:
  case PLANAR_V:
  default:
    shift = 0;
    break;
  }
  return (color >> shift) & 0xff;
}

// Draws or fades one pixel of a packed RGB format.
// _dp points to the first of three consecutive pixel_t components, stored
// in B, G, R order.
// lightIt == true: the pixel gets the given color.
// lightIt == false: with fadeBackground the components are scaled by 7/8,
// otherwise the pixel is left alone.
template<typename pixel_t, bool fadeBackground>
void AVS_FORCEINLINE LightOnePixelPackedRGB(const bool lightIt, BYTE* _dp, int val_color_R, int val_color_G, int val_color_B)
{
  pixel_t* const bgr = reinterpret_cast<pixel_t*>(_dp);

  if (lightIt) {
    bgr[0] = val_color_B;
    bgr[1] = val_color_G;
    bgr[2] = val_color_R;
    return;
  }

  if constexpr (fadeBackground) {
    // background darkening, same factor for each component
    for (int component = 0; component < 3; ++component)
      bgr[component] = (pixel_t)((bgr[component] * 7) >> 3);
  }
}

// Draws or fades the j-th pixel of a single plane row.
// lightIt == true: dstp[j] gets val_color.
// lightIt == false: with fadeBackground the existing value is darkened to
// 7/8, otherwise it is left alone. RGB values are scaled from zero, other
// (luma) values are scaled relative to the limited range minimum (16 at 8 bits).
// bits_per_pixel only matters for 16 bit integer pixels; for 8 bit and float
// pixels the depth follows from pixel_t.
template<typename pixel_t, bool fadeBackground, bool isRGB>
void AVS_FORCEINLINE LightOnePixel(const bool lightIt, pixel_t* dstp, int j, pixel_t& val_color, int bits_per_pixel)
{
  (void)bits_per_pixel; // not needed by every instantiation

  pixel_t& target = dstp[j];

  if (lightIt) { // character definition bits aligned to msb
    target = val_color;
    return;
  }

  if constexpr (!fadeBackground) {
    return; // background is kept
  }
  else if constexpr (sizeof(pixel_t) == 4) {
    // 32 bit float
    if constexpr (isRGB) {
      constexpr float factor = 7.0f / 8;
      target = (pixel_t)(target * factor);
    }
    else {
      constexpr float range_min_f = 16.0f / 255.0f;
      target = (pixel_t)(((target - range_min_f) * 7 / 8) + range_min_f);
    }
  }
  else if constexpr (isRGB) {
    target = (pixel_t)((target * 7) >> 3);
  }
  else {
    // 16 = y_min
    // speed optimization: one subtraction less, 5-8% faster
    // (((Y - 16) * 7) >> 3) + 16 = ((Y * 7) >> 3) + 2
    // in general: ((Y * 7) >> 3) + n, where n = range_min - ((range_min * 7) >> 3)
    const int depth = (sizeof(pixel_t) == 1) ? 8 : bits_per_pixel; // constant for 8 bits
    const int range_min = 16 << (depth - 8);
    const int n = range_min - ((range_min * 7) >> 3);
    target = (pixel_t)(((target * 7) >> 3) + n);
  }
}

// Sets the j-th chroma sample of the U and V rows.
// One chroma sample covers several luma positions; fontpixelcount and
// halopixelcount tell how many of the (weighted) positions belong to the text
// and to its outline. The weights of all positions add up to:
//   LEFT_420: 8 (1-2-1 | 1-2-1), LEFT_422: 4 (1-2-1),
//   CENTER_420: 4 (1-1 | 1-1), CENTER_422: 2 (1-1), CENTER_411: 4 (1-1-1-1)
// A sample that is fully text or fully outline gets that color directly.
// Otherwise the result is the average of font color, halo color and the
// existing background weighted by the counts; with fadeBackground the
// background is first reduced to 7/8 of its distance from the chroma center.
// Without fadeBackground a sample with no text and no outline is not touched.
template<typename pixel_t, int logXRatioUV, int logYRatioUV, bool fadeBackground, ChromaLocationMode chromaMode>
static void LightOneUVPixel(pixel_t* dstpU, int j, pixel_t* dstpV, pixel_t& font_color_u, pixel_t& font_color_v, pixel_t& halo_color_u, pixel_t& halo_color_v,
  int fontpixelcount, int halopixelcount,
  int bits_per_pixel
)
{
  (void)bits_per_pixel; // not needed by every instantiation

  if constexpr (!fadeBackground) {
    if (fontpixelcount == 0 && halopixelcount == 0)
      return; // no change, keep background
  }

  // the sum of weights is a power of two, given by its exponent
  constexpr int divshift =
    (chromaMode == LEFT_420) ? 3 :
    (chromaMode == LEFT_422) ? 2 :
    (chromaMode == CENTER_420) ? 2 :
    (chromaMode == CENTER_422) ? 1 :
    (chromaMode == CENTER_411) ? 2 :
    0; // unreached
  constexpr int totalpixelcount = 1 << divshift;

  if (fontpixelcount == totalpixelcount) {
    dstpU[j] = font_color_u;
    dstpV[j] = font_color_v;
    return;
  }
  if (halopixelcount == totalpixelcount) {
    dstpU[j] = halo_color_u;
    dstpV[j] = halo_color_v;
    return;
  }

  // mixed sample: not reached, only when subsampled
  pixel_t backgroundU = dstpU[j];
  pixel_t backgroundV = dstpV[j];
  const int backgroundpixelcount = totalpixelcount - fontpixelcount - halopixelcount;

  if constexpr (fadeBackground) {
    // have to fade the existing background color
    if constexpr (sizeof(pixel_t) != 4) {
      // speed optimization: one subtraction less
      // (((U - 128) * 7) >> 3) + 128 = ((U * 7) >> 3) + 16
      // in general: ((U * 7) >> 3) + n where n = range_half - ((range_half * 7) >> 3)
      const int depth = (sizeof(pixel_t) == 1) ? 8 : bits_per_pixel; // constant for 8 bits
      const int range_half = 1 << (depth - 1);
      const int n = range_half - ((range_half * 7) >> 3);
      backgroundU = (pixel_t)(((backgroundU * 7) >> 3) + n);
      backgroundV = (pixel_t)(((backgroundV * 7) >> 3) + n);
    }
    else {
      constexpr float chroma_center = 0.0f; // ancient times this was 0.5
      constexpr float factor = 7.0f / 8.0f;
      backgroundU = (pixel_t)(((backgroundU - chroma_center) * factor) + chroma_center);
      backgroundV = (pixel_t)(((backgroundV - chroma_center) * factor) + chroma_center);
    }
  }

  // compute resulting color weighted by pixel kinds
  if constexpr (sizeof(pixel_t) != 4) {
    constexpr int rounder = totalpixelcount >> 1;
    const int weighted_sum_u = (font_color_u * fontpixelcount + halo_color_u * halopixelcount + backgroundU * backgroundpixelcount + rounder);
    const int weighted_sum_v = (font_color_v * fontpixelcount + halo_color_v * halopixelcount + backgroundV * backgroundpixelcount + rounder);
    dstpU[j] = weighted_sum_u >> divshift;
    dstpV[j] = weighted_sum_v >> divshift;
  }
  else {
    constexpr float chroma_center = 0.0f;
    const float weighted_sum_u = (font_color_u - chroma_center) * fontpixelcount + (halo_color_u - chroma_center) * halopixelcount + (backgroundU - chroma_center) * backgroundpixelcount;
    const float weighted_sum_v = (font_color_v - chroma_center) * fontpixelcount + (halo_color_v - chroma_center) * halopixelcount + (backgroundV - chroma_center) * backgroundpixelcount;
    dstpU[j] = weighted_sum_u / totalpixelcount + chroma_center;
    dstpV[j] = weighted_sum_v / totalpixelcount + chroma_center;
  }
}

// Applies the alignment to the text position and clips the text to the frame.
//
// s:          remapped characters of the line; characters that fall off the
//             left edge are removed from its front
// width, height: frame dimensions in pixels
// FONT_WIDTH, FONT_HEIGHT: dimensions of one character cell
// align:      1-9, numeric keypad layout (see alignToBitmask)
// useHalocolor: the outline needs one extra line above and below the text
// x, y:       in: anchor position; out: top left drawing position, never negative
// len:        in: number of characters; out: number of characters to draw.
//             It can be zero or negative when nothing of the text is visible.
// startindex: out: number of characters cut off at the left
// xstart:     out: first visible pixel column of the first drawn character
// ystart, yend: out: first and one-past-last visible line of the characters
static void adjustWriteLimits(std::vector<int>& s, const int width, const int height,
  const int FONT_WIDTH, const int FONT_HEIGHT, 
  int align, 
  const bool useHalocolor,
  int& x, int& y, int& len, int& startindex, int& xstart, int& ystart, int& yend)
{
  const int al = alignToBitmask(align);

  // alignment X
  if (al & ATA_RIGHT)
    x -= (FONT_WIDTH * len - 1);
  else if (al & ATA_CENTER)
    x -= (FONT_WIDTH * len / 2);

  // alignment Y
  if (al & ATA_BASELINE)
    y -= FONT_HEIGHT / 2;
  else if (al & ATA_BOTTOM)
    y -= (FONT_HEIGHT - 1);

  const int final_height = useHalocolor ? FONT_HEIGHT + 2 : FONT_HEIGHT;
  if (useHalocolor) y = y - 1; // one more top line, adjust y (we do it after the alignment!)

  // Chop text if exceed right margin
  // keep last character of which at least one pixel can be drawn
  if (len * FONT_WIDTH > width - x)
    len = (width - x + FONT_WIDTH - 1) / FONT_WIDTH;
  // FIXME: what if the character not, but its left side halo would be seen?

  startindex = 0;
  xstart = 0;
  // Chop 1st char if exceed left margin
  if (x < 0) {
    startindex = (-x) / FONT_WIDTH;
    xstart = (-x) % FONT_WIDTH;
    x = 0;
  }

  ystart = 0;
  yend = final_height;
  // Chop font if exceed bottom margin
  if (y > height - final_height)
    yend = height - y;

  // Chop font if exceed top margin
  if (y < 0) {
    ystart = -y;
    y = 0;
  }

  // Roll in start index
  if (startindex > 0) {
    // A text lying completely left of the frame asks for more characters
    // than there are: never erase beyond the end of the vector.
    const size_t cut_off = static_cast<size_t>(startindex);
    const size_t removable = cut_off < s.size() ? cut_off : s.size();
    s.erase(s.begin(), s.begin() + static_cast<std::vector<int>::difference_type>(removable));
    len -= startindex;
  }
}

// Appends a run of bits to a packed bit line where bit 0 is the msb of dst[0].
// The run is taken from the most significant end of fontlinebuf and is placed into dst
// starting at bit offset 'bitposition'.
// Only the first touched byte is or'd with the existing content, all following bytes are
// overwritten as a whole. Source bits after the valid ones are supposed to be zero.
//   dst               target bit line
//   bitposition       bit offset of the first written bit in dst
//   fontlinebuf       source bytes, valid bits start at the msb of the first byte
//   fontlinebuf_size  number of source bytes
//   bitcount          number of bits to insert
static void insert_from_msb_bit(uint8_t *dst, int bitposition, const uint8_t* fontlinebuf, int fontlinebuf_size, int bitcount)
{
  int out_index = bitposition / 8;
  const int bit_offset = bitposition % 8;

  if (bit_offset <= 0) {
    // target is on byte boundary: plain bytewise copy
    for (int in_index = 0; bitcount > 0; bitcount -= 8)
      dst[out_index++] = fontlinebuf[in_index++]; // unused lsb bits of the last byte are supposed to be 0
    return;
  }

  // Target starts inside a byte: its upper 'bit_offset' bits are already occupied,
  // the lower 'free_bits' bits receive the leading bits of the source.
  const int free_bits = 8 - bit_offset;
  const uint8_t low_mask = (uint8_t)(0xFF >> bit_offset);
  dst[out_index++] |= (uint8_t)((fontlinebuf[0] >> bit_offset) & low_mask);
  bitcount -= free_bits;

  // The rest is read from the source as if it was shifted left by 'free_bits':
  // each output byte is merged from two neighbouring source bytes.
  const int last_in_index = fontlinebuf_size - 1;
  for (int in_index = 0; bitcount > 0; ++in_index, bitcount -= 8) {
    int merged = fontlinebuf[in_index] << free_bits;
    if (in_index < last_in_index)
      merged |= fontlinebuf[in_index + 1] >> bit_offset;
    // for the last source byte the missing bits are left as zero
    dst[out_index++] = (uint8_t)merged;
  }
}

// Tests a single bit of a packed bit line (bit 0 is the msb of src[0]).
// Returns zero when the bit is clear; otherwise the returned value is the
// bit mask itself (non-zero), not necessarily 1.
static uint8_t get_bit(uint8_t* src, const int bitposition)
{
  const int byte_index = bitposition / 8;
  const int bit_in_byte = bitposition % 8;
  const uint8_t selector = 1 << (7 - bit_in_byte);
  return src[byte_index] & selector;
}

// Counts the set bits among 'count' consecutive bits of a packed bit line,
// starting at bit offset 'bitposition'.
static int get_bits(uint8_t* src, int bitposition, int count)
{
  int set_bits = 0;
  for (; count != 0; --count, ++bitposition) {
    if (get_bit(src, bitposition) != 0)
      ++set_bits;
  }
  return set_bits;
}

// Builds the outline (halo) mask of the pre-rendered text.
// Reads 'stringbitmap' (one packed bit line per row, msb is the leftmost pixel) and fills
// 'stringbitmap_outline', which is expected to have the same dimensions.
// An outline bit is set where the font bit itself is clear but at least one of its eight
// neighbours (left, right, top, bottom and the four diagonals) is set.
// The bitmap must have at least two lines: line 1 is read unconditionally for line 0.
void PreRendered::make_outline() {
  // h: number of bitmap lines, w: number of bytes in a line
  auto h = stringbitmap.size();
  auto w = stringbitmap[0].size();

  // circular line buffer for holding precalculated shifted lines
  // (previous, current and next line, each or'd from its left and right shifted copy)
  std::vector<uint8_t> buf1(w);
  std::vector<uint8_t> buf2(w);
  std::vector<uint8_t> buf3(w);

  // it's unnecessary to mask the last byte against the valid bit count, those bits won't be drawn
  // const uint8_t mask = ~(0xFF >> (8 - (bitcounter % 8)));

  // Shifts a line one bit to the left and one bit to the right, the two results are or'd.
  // Bits are carried over the byte boundaries; zeros are shifted in at both ends of the line.
  auto make_dizzyLR = [](uint8_t* dst, auto src, size_t w) {
    if (w == 1)
    {
      // single byte line: no neighbouring byte to carry bits from
      *dst = (src[0] << 1) | (src[0] >> 1);
      return;
    }
    // leftmost: nothing to carry in from the left
    uint8_t left = (src[0] << 1) | (src[1] >> (8 - 1));
    uint8_t right = 0 | (src[0] >> 1);
    *dst = left | right;
    dst++;
    src++;
    // middle: carry the msb of the next byte and the lsb of the previous byte
    for (size_t i = 1; i < w - 1; ++i)
    {
      left = (src[0] << 1) | (src[1] >> (8 - 1));
      right = (src[-1] << (8 - 1)) | (src[0] >> 1);
      *dst++ = left | right;
      src++;
    }
    // rightmost: nothing to carry in from the right
    left = (src[0] << 1) | 0;
    right = (src[-1] << (8 - 1)) | (src[0] >> 1);
    *dst = left | right;
    };

#ifdef MSVC_PURE
  // Workaround for an MSVC optimizer failure (as of 17.8.4, release, optimize for speed, SSE2):
  // bad code was generated for this function. Compiler bug, said to be fixed in 17.9 preview 3.
  // https://developercommunity.visualstudio.com/t/Bad-c-codegen-in-1784-x64-unless-se/10565370
  volatile uint8_t* src_prev;
  volatile uint8_t* src;
#else
  uint8_t* src_prev;
  uint8_t* src;
#endif
  uint8_t* src_next;
  uint8_t* dst;

  // the three roles are rotated after each processed line, no data is copied
  uint8_t* tmp_line;
  uint8_t* prev_line_LR = buf1.data();
  uint8_t* curr_line_LR = buf2.data();
  uint8_t* next_line_LR = buf3.data();

  // line 0, no previous line
  size_t y = 0;
  dst = stringbitmap_outline[y].data();
  src = stringbitmap[y].data();
  src_next = stringbitmap[y + 1].data();
  make_dizzyLR(curr_line_LR, src, w);
  make_dizzyLR(next_line_LR, src_next, w);
  for (size_t x = 0; x < w; x++) {
    // left/right neighbours, the three neighbours below; font pixels themselves are excluded
    dst[x] = (curr_line_LR[x] | next_line_LR[x] | src_next[x]) & ~src[x];
  }
  // rotate the line buffers: current becomes previous, next becomes current
  tmp_line = prev_line_LR;
  prev_line_LR = curr_line_LR;
  curr_line_LR = next_line_LR;
  next_line_LR = tmp_line;

  src_prev = src;
  src = src_next;
  y++;

  // middle lines, y runs on 1..(h-2)
  for (; y < h - 1; y++)
  {
    dst = stringbitmap_outline[y].data();
    src_next = stringbitmap[y + 1].data();
    // only the next line has to be shifted, the other two are kept from the previous rounds
    make_dizzyLR(next_line_LR, src_next, w);
    for (size_t x = 0; x < w; x++) {
      // all eight neighbours; font pixels themselves are excluded
      dst[x] = (prev_line_LR[x] | curr_line_LR[x] | next_line_LR[x] | src_prev[x] | src_next[x]) & ~src[x];
    }
    tmp_line = prev_line_LR;
    prev_line_LR = curr_line_LR;
    curr_line_LR = next_line_LR;
    next_line_LR = tmp_line;
    src_prev = src;
    src = src_next;
  }
  // last one, no next line
  dst = stringbitmap_outline[y].data();
  for (size_t x = 0; x < w; x++) {
    dst[x] = (prev_line_LR[x] | curr_line_LR[x] | src_prev[x]) & ~src[x];
  }
}

// Pre-renders one line of text into a packed 1 bit per pixel bitmap ('stringbitmap') and,
// when a halo is requested, into a matching outline mask ('stringbitmap_outline').
//
//   fonts                   font bitmap data; a character occupies
//                           FONT_HEIGHT * fontline_bytes_storage consecutive bytes
//   fontline_bytes_storage  bytes stored for one pixel line of a character
//   _width, _height         frame dimensions
//   _x, _y                  requested text position, adjusted by alignment and clipping
//   s                       character indexes into 'fonts'; characters clipped away on
//                           the left are removed from it
//   align                   alignment code, passed on to adjustWriteLimits
//   _useHalocolor           reserve a one pixel frame around the text and build the outline
//   FONT_WIDTH, FONT_HEIGHT character cell size in pixels
//   _safety_bits_x_left/right  extra zero columns on the two sides of the bitmap, used by
//                           the horizontally subsampled chroma renderers
//
// Layout of a bitmap line, from the left:
//   [left safety bits][1 halo column, optional][len * FONT_WIDTH font bits]
//   [1 halo column, optional][right safety bits]
// When nothing is visible (len <= 0) no bitmap is allocated and text_width stays 0.
PreRendered::PreRendered(
  const uint8_t* fonts,
  const int fontline_bytes_storage,
  const int _width, const int _height,
  int _x, int _y, // they may change
  std::vector<int>& s,
  int align,
  const bool _useHalocolor,
  const int FONT_WIDTH, const int FONT_HEIGHT,
  const int _safety_bits_x_left,
  const int _safety_bits_x_right
  )
  :
  useHalocolor(_useHalocolor), width(_width), height(_height),
  safety_bits_x_left(_safety_bits_x_left),
  safety_bits_x_right(_safety_bits_x_right)
{
  len = (int)s.size();
  x = _x;
  y = _y;
  xstart = 0;
  ystart = 0;
  yend = 0;
  text_width = 0;

  int startindex = 0;

  // optional two additional lines for top and bottom for outline
  stringbitmap_height = useHalocolor ? FONT_HEIGHT + 2 : FONT_HEIGHT;

  adjustWriteLimits(s, width, height,
    FONT_WIDTH,
    FONT_HEIGHT, // the extra top and bottom halo lines are added inside
    align,
    useHalocolor,
    // adjusted parameters
    x, y, len, startindex, xstart, ystart, yend);

  if (len <= 0)
    return;

  // prepare font mask and outline mask

  // left-right safety bits for horizontal subsampled cases
  const int stringbitmap_width = safety_bits_x_left + FONT_WIDTH * len + (useHalocolor ? 2 : 0) + safety_bits_x_right;
  const int stringbitmapline_bytes = (stringbitmap_width + 7) / 8;

  // allocate actual space
  stringbitmap.resize(stringbitmap_height);
  for (auto& subarray : stringbitmap) subarray.resize(stringbitmapline_bytes);
  if (useHalocolor) {
    stringbitmap_outline.resize(stringbitmap_height);
    for (auto& subarray : stringbitmap_outline) subarray.resize(stringbitmapline_bytes);
  }

  // fill matrix with fonts
  // optionally leave spare left-right-top-bottom pixel lines for out-of-matrix halo pixels
  // plus zero bit columns for safely drawing unaligned horizontal subsampling cases
  const uint8_t zerobyte[1] = { 0 };
  int bitcounter = 0; // running bit position, same for every line

  // Appends 'count' zero bit columns to every line at the running bit position.
  // All padding columns go through this helper, so each of them is placed at
  // 'bitcounter' and none of them can touch bits that were written earlier.
  auto append_zero_columns = [&](const int count) {
    for (int ty = 0; ty < stringbitmap_height; ty++)
      insert_from_msb_bit(&stringbitmap[ty][0], bitcounter, &zerobyte[0], 1, count);
    bitcounter += count;
  };

  // left safety columns for horizontal subsampling cases
  append_zero_columns(safety_bits_x_left);

  // left halo column
  if (useHalocolor)
    append_zero_columns(1);

  // if outline needed, leave place for top outline, copy actual character to the second row
  const int ypos_of_char_in_bitmap = useHalocolor ? 1 : 0;

  for (int i = 0; i < len; i++) {
    int num = s[i];
    const uint8_t* fontlinebuf = &fonts[num * FONT_HEIGHT * fontline_bytes_storage];
    for (int ty = 0; ty < FONT_HEIGHT; ty++) {
      // stuff FONT_WIDTH bits from fontline into bitcounter_th bit of target line
      insert_from_msb_bit(&stringbitmap[ty + ypos_of_char_in_bitmap][0], bitcounter, fontlinebuf, fontline_bytes_storage, FONT_WIDTH);
      fontlinebuf += fontline_bytes_storage;
    }
    bitcounter += FONT_WIDTH;
  }

  // right halo column
  if (useHalocolor)
    append_zero_columns(1);

  // right safety columns for horizontal subsampling cases
  append_zero_columns(safety_bits_x_right);

  assert(bitcounter == stringbitmap_width);

  if (useHalocolor)
    make_outline();

  // actual visible pixel count: the visible part of the first character plus the rest,
  // cut at the right margin of the frame
  text_width = (FONT_WIDTH - xstart) + (len - 1) * FONT_WIDTH;
  if (x + text_width > width) text_width -= (x + text_width - width);

  if (useHalocolor) {
    // x was calculated w/o the extra left-right padding
    xstart += 1; // worst case, skip leftmost extra halo column
    if (x + xstart - 1 >= 0 && x > 0) {
      // there is room on the left: start one pixel earlier and show the left halo column
      x -= 1;
      xstart -= 1;
      text_width += 1;
    }
    // still good an increased text_width for last halo column?
    if (x + text_width + 1 < width)
      text_width += 1;
  }
}

// Converts an 8 bit chroma (U or V) value to the target bit depth.
// Integer formats: the value is shifted up by (bits_per_pixel - 8).
// 32 bit float: the value is centered to zero, 128 -> 0.0
template<typename pixel_t>
static auto getHBDColor_UV(int color, int bits_per_pixel)
{
  if (bits_per_pixel >= 32) {
    constexpr float shift = 0.0f;
    return static_cast<pixel_t>((color - 128) / 255.0f + shift);
    // FIXME: consistently using limited->fullscale conversion for float
  }
  return static_cast<pixel_t>(color << (bits_per_pixel - 8));
}

// Converts an 8 bit luma value to the target bit depth.
// Integer formats: the value is shifted up by (bits_per_pixel - 8).
// 32 bit float: 0..255 is mapped to 0..1.0
template<typename pixel_t>
static auto getHBDColor_Y(int color, int bits_per_pixel)
{
  if (bits_per_pixel >= 32) {
    return static_cast<pixel_t>(color / 255.0f);
    // FIXME: consistently using limited->fullscale conversion for float
  }
  return static_cast<pixel_t>(color << (bits_per_pixel - 8));
}

// Converts an 8 bit R, G or B value to the target bit depth using full range scaling.
// Up to 16 bits: 0..255 is stretched to 0..(2^bits_per_pixel - 1), e.g. 0..1023,4095,16383,65535
// Above 16 bits (float): 0..255 is mapped to 0..1.0
template<typename pixel_t>
static auto getHBDColor_RGB(int color, int bits_per_pixel)
{
  if (bits_per_pixel > 16)
    return static_cast<pixel_t>(color / 255.0f);

  const int max_pixel_value = (1 << (bits_per_pixel & 31)) - 1;
  const float stretched = static_cast<float>(color) * max_pixel_value / 255;
  return static_cast<pixel_t>(stretched);
}


template<typename pixel_t, bool useHalocolor, bool fadeBackground, bool isRGB>
void Render1by1Planes(int bits_per_pixel, int color, int halocolor, int* pitches, BYTE** dstps, PreRendered& pre,
  const int planeCount, const bool is444)
{
  // 1:1 planes, Y or planar RGB or 4:4:4 U/V

  int planes_y[4] = { PLANAR_Y, PLANAR_U, PLANAR_V, PLANAR_A };
  int planes_r[4] = { PLANAR_G, PLANAR_B, PLANAR_R, PLANAR_A };
  int* planes = isRGB ? planes_r : planes_y;

  for (int p = 0; p < planeCount; p++)
  {
    const int plane = planes[p];

    if (!(isRGB || plane == PLANAR_Y || ((plane == PLANAR_U || plane == PLANAR_V) && is444)))
      continue; // Y, R, G, B is O.K. U, V is OK if 444

    pixel_t val_color;
    pixel_t val_color_outline;

    const int planecolor = getColorForPlane(plane, color);
    const int planecolor_outline = getColorForPlane(plane, halocolor);
    if (isRGB) {
      val_color = getHBDColor_RGB<pixel_t>(planecolor, bits_per_pixel);
      val_color_outline = getHBDColor_RGB<pixel_t>(planecolor_outline, bits_per_pixel);
    }
    else if (plane == PLANAR_U || plane == PLANAR_V) {
      val_color = getHBDColor_UV<pixel_t>(planecolor, bits_per_pixel);
      val_color_outline = getHBDColor_UV<pixel_t>(planecolor_outline, bits_per_pixel);
    }
    else {// Y
      val_color = getHBDColor_Y<pixel_t>(planecolor, bits_per_pixel);
      val_color_outline = getHBDColor_Y<pixel_t>(planecolor_outline, bits_per_pixel);
    }

    const int pitch = pitches[p];
    BYTE* dstp = dstps[p] + pre.x * sizeof(pixel_t) + pre.y * pitch;

    // Start rendering
    for (int ty = pre.ystart; ty < pre.yend; ty++) {
      pixel_t* _dstp = reinterpret_cast<pixel_t*>(dstp);
      uint8_t* fontline_ptr = pre.stringbitmap[ty].data();
      [[maybe_unused]] uint8_t* fontoutline_ptr;
      if constexpr(useHalocolor)
        fontoutline_ptr = pre.stringbitmap_outline[ty].data();
      int j = 0;
      const int shifted_xstart = pre.safety_bits_x_left + pre.xstart;
      for (int tx = shifted_xstart; tx < shifted_xstart + pre.text_width; tx++)
      {
        const bool lightIt = 0 != get_bit(fontline_ptr, tx);
        LightOnePixel<pixel_t, fadeBackground, isRGB>(lightIt, _dstp, j, val_color, bits_per_pixel);
        if constexpr(useHalocolor) {
          if (!lightIt)
          {
            const bool lightIt_outline = 0 != get_bit(fontoutline_ptr, tx);
            LightOnePixel<pixel_t, fadeBackground, isRGB>(lightIt_outline, _dstp, j, val_color_outline, bits_per_pixel);
          }
        }
        j += 1;
      }
      dstp += pitch;
    }
  }
}

// Draws the pre-rendered text on the subsampled U and V planes.
// A chroma sample covers more than one bitmap pixel: for each chroma sample the number of
// font and outline (halo) bits falling under it are counted - weighted according to the
// chroma placement - and handed over to LightOneUVPixel together with the colors.
//
// Template parameters:
//   pixel_t         sample type of the planes
//   useHalocolor    the outline mask (pre.stringbitmap_outline) is used as well
//   fadeBackground  forwarded to LightOneUVPixel
//   logXRatioUV, logYRatioUV  log2 of the horizontal and vertical subsampling
//   chromaMode      chroma placement; LEFT_420 and LEFT_422 use 1-2-1 horizontal weights,
//                   the other modes use equal weights
// Parameters:
//   bits_per_pixel  bit depth; overridden with 8 or 32 for 1 and 4 byte pixel_t
//   color, halocolor text and outline color, split per plane by getColorForPlane
//   pitches, dstps  index 1 is used as U, index 2 as V; pitches[1] is used for both
//   pre             the pre-rendered text bitmap with its clipped position
template<typename pixel_t, bool useHalocolor, bool fadeBackground, int logXRatioUV, int logYRatioUV, ChromaLocationMode chromaMode>
void RenderUV(int bits_per_pixel, int color, int halocolor, int* pitches, BYTE** dstps, PreRendered& pre)
{
  // some optimization hint
  if constexpr (sizeof(pixel_t) == 1)
    bits_per_pixel = 8;
  else if constexpr (sizeof(pixel_t) == 4)
    bits_per_pixel = 32;

  // draw U and V in one step
  pixel_t color_u = getHBDColor_UV<pixel_t>(getColorForPlane(PLANAR_U, color), bits_per_pixel);
  pixel_t color_v = getHBDColor_UV<pixel_t>(getColorForPlane(PLANAR_V, color), bits_per_pixel);
  pixel_t color_outline_u = getHBDColor_UV<pixel_t>(getColorForPlane(PLANAR_U, halocolor), bits_per_pixel);
  pixel_t color_outline_v = getHBDColor_UV<pixel_t>(getColorForPlane(PLANAR_V, halocolor), bits_per_pixel);

  // the luma position is converted to chroma coordinates; same offset for U and V
  const int pitchUV = pitches[1];
  const int offset = (pre.x >> logXRatioUV) * sizeof(pixel_t) + (pre.y >> logYRatioUV) * pitchUV;
  BYTE* dstpU = dstps[1] + offset;
  BYTE* dstpV = dstps[2] + offset;

  // .SubS = 1, 2 or 4
  constexpr int xSubS = 1 << logXRatioUV;
  constexpr int ySubS = 1 << logYRatioUV;

  /*
    U and V handling, multiple luma/outline/background source for a given chroma point
    resulting chroma is a weighted sum of the three pixel kinds (font/outline/background)
    012345678901
    ...#OOO#....
    ..#O###O#...
    ..#O#.#O#...
    ..#O#.#O#...
    ..#O#.#O#...
  */

  // unaligned x: for horizontal subsampling 420, 422 and 411:
  // 420, 422, 411 horizontal subsampling: one more loop because of the leftmost orphan pixel(s)
  // we can overaddress on the right, additional safety bit column(s) were added for the bitmap
  const bool unaligned_x_start = logXRatioUV > 0 && 0 != pre.x % xSubS;
  const int xplus = unaligned_x_start ? xSubS : 0; // extra orphan bits affecting rightmost chroma
  const int xshift = pre.x % xSubS; // when aligned --> 0
  // unaligned y: vertical subsampling 420
  const bool odd_y_start = logYRatioUV == 1 && 0 != pre.y % 2;
  const int yshift = odd_y_start ? 1 : 0;

  constexpr bool hasVerticalSubsample = logYRatioUV > 0;

  // safe zero array for vertical subsampled 4:2:0 case for orphan top/bottom
  // (stands in for the bitmap line that does not exist above the first or below the last line)
  std::vector<uint8_t> zeros;
  if constexpr (hasVerticalSubsample)
    zeros.resize(pre.stringbitmap[0].size());

  // bitmap lines contributing to the actual chroma line
  // second array element is only valid for vertically subsampled 4:2:0
  uint8_t* fontlines_ptr[2] = { nullptr };
  uint8_t* fontoutlines_ptr[2] = { nullptr };

  // one iteration produces one chroma line
  for (int ty = pre.ystart; ty < pre.yend; ty += ySubS) {

    pixel_t* _dstpU = reinterpret_cast<pixel_t*>(dstpU);
    pixel_t* _dstpV = reinterpret_cast<pixel_t*>(dstpV);

    if (hasVerticalSubsample && odd_y_start && ty == pre.ystart) {
      // top font line on odd y position + vertically subsampled (420)
      // the upper half of the chroma sample is outside the text: zeros
      fontlines_ptr[0] = zeros.data();
      fontlines_ptr[1] = pre.stringbitmap[ty].data();
      if constexpr(useHalocolor) {
        fontoutlines_ptr[0] = zeros.data();
        fontoutlines_ptr[1] = pre.stringbitmap_outline[ty].data();
      }
    }
    else if (hasVerticalSubsample && ty + 1 - yshift >= pre.stringbitmap_height) {
      // bottom font line on even y position
      // the lower half of the chroma sample is outside the text: zeros
      fontlines_ptr[0] = pre.stringbitmap[ty - yshift].data();
      fontlines_ptr[1] = zeros.data();
      if constexpr(useHalocolor) {
        fontoutlines_ptr[0] = pre.stringbitmap_outline[ty - yshift].data();
        fontoutlines_ptr[1] = zeros.data();
      }
    }
    else {
      // all font lines contributing to chroma can safely be used
      for (int m = 0; m < ySubS; m++)
        fontlines_ptr[m] = pre.stringbitmap[ty + m - yshift].data();

      if constexpr(useHalocolor) {
        for (int m = 0; m < ySubS; m++)
          fontoutlines_ptr[m] = pre.stringbitmap_outline[ty + m - yshift].data();
      }
    }

    // render a horizontal line
    int j = 0; // index of the chroma sample in the target line
    // (pre.xstart - xshift) is always on horizontal subsample boundary
    const int shifted_xstart = pre.safety_bits_x_left + pre.xstart - xshift;

    // left (mpeg2) location:
    // the right side neighbour of a chroma sample is the left side neighbour of the next one,
    // so the counts are kept between the iterations (sliding window)
    int fontpixels_right = 0; // used for left (mpeg2) chroma location case
    int halopixels_right = 0;
    // For the very first chroma pixel there is no previous rightside.
    // Use index -1, safe because of safety columns
    if constexpr (chromaMode == LEFT_420 || chromaMode == LEFT_422) {
      for (int yy = 0; yy < ySubS; yy++) {
        fontpixels_right += get_bits(fontlines_ptr[yy], shifted_xstart - 1, 1);
        if constexpr (useHalocolor)
          halopixels_right += get_bits(fontoutlines_ptr[yy], shifted_xstart - 1, 1);
      }
    }

    for (int tx = shifted_xstart; tx < shifted_xstart + pre.text_width + xplus; tx += xSubS) {
      // weighted count of font and outline bits under the actual chroma sample
      int fontpixels = 0;
      int halopixels = 0;

      // 411
      // +------+------+------+------+
      // | 0.25 | 0.25 | 0.25 | 0.25 |
      // +------+------+------+------+
      // 420 center (mpeg1, jpeg)
      // +------+------+
      // | 0.25 | 0.25 |
      // |------+------|
      // | 0.25 | 0.25 |
      // +------+------+
      // 422 center
      // +------+------+
      // | 0.5  | 0.5  |
      // +------+------+
      // 420 left (mpeg2)
      // ------+------+-------+
      // 0.125 | 0.25 | 0.125 |
      // ------|------+-------|
      // 0.125 | 0.25 | 0.125 |
      // ------+------+-------+
      // 422 left (mpeg2)
      // ------+------+-------+
      // 0.25  | 0.5  | 0.25  |
      // ------+------+-------+

      if constexpr (chromaMode == LEFT_420 || chromaMode == LEFT_422) {
        int fontpixels_left = 0;
        int halopixels_left = 0;
        int fontpixels_mid = 0;
        int halopixels_mid = 0;

        // shift variables: the previous right side is the actual left side
        fontpixels_left = fontpixels_right;
        if constexpr (useHalocolor)
          halopixels_left = halopixels_right;
        // gather counts, first (or only) line
        fontpixels_mid = get_bits(fontlines_ptr[0], tx, 1);
        fontpixels_right = get_bits(fontlines_ptr[0], tx + 1, 1);
        if constexpr (useHalocolor) {
          halopixels_mid = get_bits(fontoutlines_ptr[0], tx, 1);
          halopixels_right = get_bits(fontoutlines_ptr[0], tx + 1, 1);
        }
        if constexpr (chromaMode == LEFT_420) {
          // second line of the vertically subsampled case
          fontpixels_mid += get_bits(fontlines_ptr[1], tx, 1);
          fontpixels_right += get_bits(fontlines_ptr[1], tx + 1, 1);
          if constexpr (useHalocolor) {
            halopixels_mid += get_bits(fontoutlines_ptr[1], tx, 1);
            halopixels_right += get_bits(fontoutlines_ptr[1], tx + 1, 1);
          }
        }
        // 1-2-1 weight
        fontpixels = fontpixels_left + 2 * fontpixels_mid + fontpixels_right;
        if constexpr (useHalocolor)
          halopixels = halopixels_left + 2 * halopixels_mid + halopixels_right;
      }
      else {
        // center, equal weights: plain count of the xSubS * ySubS bits
        for (int yy = 0; yy < ySubS; yy++) {
          fontpixels += get_bits(fontlines_ptr[yy], tx, xSubS);
          if constexpr (useHalocolor)
            halopixels += get_bits(fontoutlines_ptr[yy], tx, xSubS);
        }
      }
      LightOneUVPixel<pixel_t, logXRatioUV, logYRatioUV, fadeBackground, chromaMode>(_dstpU, j, _dstpV,
        color_u, color_v, color_outline_u, color_outline_v,
        fontpixels, halopixels,
        bits_per_pixel
        );

      j += 1;
    }

    dstpU += pitchUV;
    dstpV += pitchUV;
  }
}

template<typename pixel_t, bool fadeBackground, bool isRGB>
void do_DrawStringPlanar(
  const int width, const int height, BYTE** dstps, int* pitches, const int logXRatioUV, const int logYRatioUV, const int planeCount,
  int bits_per_pixel,
  const BitmapFont* bmfont, int x, int y, std::vector<int>& s, int color, int halocolor, int align, bool useHalocolor, int chromalocation)
{
  // some optimization hint
  if constexpr (sizeof(pixel_t) == 1)
    bits_per_pixel = 8;
  else if constexpr (sizeof(pixel_t) == 4)
    bits_per_pixel = 32;

  // Chroma 411 would require 3 extra bits on both left and right.
  // Chroma 420 and 422 need 1 bits on both left and right
  // Left (mpeg2) chroma placement (420, 422) requires an additional one on the left.
  const bool isLeftStyleChromaLoc = (logXRatioUV == 1) && 
    ((chromalocation == ChromaLocation_e::AVS_CHROMA_LEFT) || 
      (chromalocation == ChromaLocation_e::AVS_CHROMA_TOP_LEFT) || // not supported yet; for the sake of completeness
      (chromalocation == ChromaLocation_e::AVS_CHROMA_BOTTOM_LEFT)); // not supported yet; for the sake of completeness
  const int safety_bits_x_left = (1 << logXRatioUV) - 1 + (isLeftStyleChromaLoc ? 1 : 0);
  const int safety_bits_x_right = (1 << logXRatioUV) - 1;

  PreRendered pre(bmfont->font_bitmaps.data(), bmfont->fontline_bytes, width, height, x, y, s, align, useHalocolor, 
    bmfont->width, bmfont->height, safety_bits_x_left, safety_bits_x_right);

  if (pre.len <= 0)
    return;

  const bool is444 = !isRGB && (planeCount >= 3) && (logXRatioUV == 0) && (logYRatioUV == 0);

  if (useHalocolor)
    Render1by1Planes<pixel_t, true, fadeBackground, isRGB>(bits_per_pixel, color, halocolor, pitches, dstps, pre, planeCount, is444);
  else
    Render1by1Planes<pixel_t, false, fadeBackground, isRGB>(bits_per_pixel, color, halocolor, pitches, dstps, pre, planeCount, is444);

  if constexpr (isRGB)
    return;

  if (is444)
    return;

  if (planeCount < 3)
    return; // Y

  // Subsampled cases, templates help a lot
  // for 420 and 422 center and left supported only, what is not "center", we do the "left" method
  if (logXRatioUV == 2 && logYRatioUV == 0) {// 411
    // ignore chromalocation
    if (useHalocolor)
      RenderUV<pixel_t, true, fadeBackground, 2, 0, ChromaLocationMode::CENTER_411>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
    else
      RenderUV<pixel_t, false, fadeBackground, 2, 0, ChromaLocationMode::CENTER_411>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
  }
  else if (logXRatioUV == 1 && logYRatioUV == 0) {
    if (chromalocation == ChromaLocation_e::AVS_CHROMA_CENTER) {
      if (useHalocolor)
        RenderUV<pixel_t, true, fadeBackground, 1, 0, ChromaLocationMode::CENTER_422>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
      else
        RenderUV<pixel_t, false, fadeBackground, 1, 0, ChromaLocationMode::CENTER_422>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
    }
    else {
      if (useHalocolor)
        RenderUV<pixel_t, true, fadeBackground, 1, 0, ChromaLocationMode::LEFT_422>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
      else
        RenderUV<pixel_t, false, fadeBackground, 1, 0, ChromaLocationMode::LEFT_422>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
    }
  }
  else if (logXRatioUV == 1 && logYRatioUV == 1) {
    if (chromalocation == ChromaLocation_e::AVS_CHROMA_CENTER) {
      if (useHalocolor)
        RenderUV<pixel_t, true, fadeBackground, 1, 1, ChromaLocationMode::CENTER_420>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
      else
        RenderUV<pixel_t, false, fadeBackground, 1, 1, ChromaLocationMode::CENTER_420>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
    }
    else {
      if (useHalocolor)
        RenderUV<pixel_t, true, fadeBackground, 1, 1, ChromaLocationMode::LEFT_420>(bits_per_pixel, color, halocolor, pitches, dstps, pre);
      else
        RenderUV<pixel_t, false, fadeBackground, 1, 1, ChromaLocationMode::LEFT_420>(bits_per_pixel, color, halocolor, pitches, dstps, pre);

    }
  }
  else
    assert(0);
}

template<bool useHalocolor, bool fadeBackground, ChromaLocationMode chromaMode>
void RenderYUY2(int color, int halocolor, int pitch, BYTE* _dstp, PreRendered& pre)
{
  BYTE* dstp = _dstp + pre.x * 2 + pre.y * pitch;
  BYTE* dstpUV = _dstp + (pre.x / 2) * 4 + 1 + pre.y * pitch; // always points to U of a YUYV block

  typedef uint8_t pixel_t;

  pixel_t val_color = getColorForPlane(PLANAR_Y, color);
  pixel_t val_color_outline = getColorForPlane(PLANAR_Y, halocolor);
  pixel_t color_u = getColorForPlane(PLANAR_U, color);
  pixel_t color_outline_u = getColorForPlane(PLANAR_U, halocolor);
  pixel_t color_v = getColorForPlane(PLANAR_V, color);
  pixel_t color_outline_v = getColorForPlane(PLANAR_V, halocolor);

  // YUY2 like 422
  constexpr int logXRatioUV = 1;
  constexpr int logYRatioUV = 0;
  constexpr int xSubS = 2;

  // unaligned x: for horizontal subsampling 420, 422 (YUY2) and 411:
  // 420, 422, 411, YUY2 horizontal subsampling: one more loop because of the leftmost orphan pixel(s)
  // we can overaddress on the right, additional safety bit column(s) were added for the bitmap
  const bool unaligned_x_start = 0 != pre.x % xSubS;
  const int xplus = unaligned_x_start ? xSubS : 0; // extra orphan bits affecting rightmost chroma
  const int xshift = pre.x % xSubS; // when aligned --> 0

  for (int ty = pre.ystart; ty < pre.yend; ty++) {
    BYTE* dp = dstp;
    BYTE* dpUV = dstpUV;
    uint8_t* fontline_ptr = pre.stringbitmap[ty].data();
    [[maybe_unused]] uint8_t* fontoutline_ptr;
    if constexpr (useHalocolor)
      fontoutline_ptr = pre.stringbitmap_outline[ty].data();

    // first Y, like in 4:4:4
    int j = 0;
    int shifted_xstart = pre.safety_bits_x_left + pre.xstart;
    for (int tx = shifted_xstart; tx < shifted_xstart + pre.text_width; tx++) {
      const bool lightIt = 0 != get_bit(fontline_ptr, tx);
      LightOnePixel<uint8_t, fadeBackground, false>(lightIt, dp, j, val_color, 8);
      if constexpr (useHalocolor) {
        if (!lightIt)
        {
          const bool lightIt_outline = 0 != get_bit(fontoutline_ptr, tx);
          LightOnePixel<uint8_t, fadeBackground, false>(lightIt_outline, dp, j, val_color_outline, 8);
        }
      }
      j += 2; // next Y
    }

    // then chroma
    j = 0;
    // (pre.xstart - xshift) is always on horizontal subsample boundary
    shifted_xstart = pre.safety_bits_x_left + pre.xstart - xshift;

    // left (mpeg2) location: 
    int fontpixels_right = 0; // used for left (mpeg2) chroma location case
    int halopixels_right = 0;
    // For the very first chroma pixel there is no previous rightside.
    // Use index -1, safe because of safety columns
    if constexpr (chromaMode == LEFT_422) {
      fontpixels_right = get_bits(fontline_ptr, shifted_xstart - 1, 1);
      if constexpr (useHalocolor)
        halopixels_right = get_bits(fontoutline_ptr, shifted_xstart - 1, 1);
    }

    for (int tx = shifted_xstart; tx < shifted_xstart + pre.text_width + xplus; tx += xSubS) {
      int fontpixels = 0;
      int halopixels = 0;
      // 422 center (mpeg1, jpeg)
      // +------+------+
      // | 0.5  | 0.5  |
      // +------+------+
      // 422 left (mpeg2)
      // ------+------+-------+
      // 0.25  | 0.5  | 0.25  |
      // ------+------+-------+

      if constexpr (chromaMode == LEFT_422) {
        int fontpixels_left = 0;
        int halopixels_left = 0;
        int fontpixels_mid = 0;
        int halopixels_mid = 0;

        // shift variables
        fontpixels_left = fontpixels_right;
        if constexpr (useHalocolor)
          halopixels_left = halopixels_right;
        // gather counts
        fontpixels_mid = get_bits(fontline_ptr, tx, 1);
        fontpixels_right = get_bits(fontline_ptr, tx + 1, 1);
        if constexpr (useHalocolor) {
          halopixels_mid = get_bits(fontoutline_ptr, tx, 1);
          halopixels_right = get_bits(fontoutline_ptr, tx + 1, 1);
        }
        // 1-2-1 weight
        fontpixels = fontpixels_left + 2 * fontpixels_mid + fontpixels_right;
        if constexpr (useHalocolor)
          halopixels = halopixels_left + 2 * halopixels_mid + halopixels_right;
      }
      else {
        // center, equal weights
        fontpixels += get_bits(fontline_ptr, tx, xSubS);
        if constexpr (useHalocolor)
          halopixels += get_bits(fontoutline_ptr, tx, xSubS);
      }

      LightOneUVPixel<uint8_t, logXRatioUV, logYRatioUV, fadeBackground, chromaMode>(dpUV /*U*/, j, dpUV + 2 /*V*/,
        color_u, color_v, color_outline_u, color_outline_v,
        fontpixels, halopixels,
        8
        );
      j += 4; // YUYV
    }

    dstp += pitch;
    dstpUV += pitch;
  }
}

// Draws a text line on a packed YUY2 frame.
// The text is pre-rendered into a bitmap, then drawn by the proper RenderYUY2 instance.
//   width, height   frame dimensions
//   _dstp, pitch    write pointer and pitch (in bytes) of the frame
//   bmfont          the font to use
//   x, y, align     requested position and alignment of the text
//   s               character indexes; may be shortened by the clipping
//   color, halocolor, useHalocolor  text color, outline color, outline on/off
//   chromalocation  ChromaLocation_e value; AVS_CHROMA_CENTER is rendered with the
//                   "center" method, every other value with the "left" method
template<bool fadeBackground>
static void do_DrawStringYUY2(
  const int width, const int height, BYTE* _dstp, int pitch, const BitmapFont* bmfont, int x, int y, std::vector<int>& s,
  int color, int halocolor, int align, bool useHalocolor, int chromalocation)
{
  // The "left" method is used below for every location which is not "center", and it needs
  // one more bit on the left for its "index - 1" lookup. The extra bit is reserved by the
  // very same condition as the method is chosen by, so the lookup never addresses a bit
  // before the beginning of the bitmap line.
  const bool isCenterChromaLoc = chromalocation == ChromaLocation_e::AVS_CHROMA_CENTER;
  const bool isLeftStyleChromaLoc = !isCenterChromaLoc;
  // Like 422. Chroma subsampling would require 1 extra bit playground on both left and right
  const int safety_bits_x_left = 1 + (isLeftStyleChromaLoc ? 1 : 0);
  const int safety_bits_x_right = 1;

  PreRendered pre(bmfont->font_bitmaps.data(), bmfont->fontline_bytes, width, height, x, y, s, align, useHalocolor, bmfont->width, bmfont->height,
    safety_bits_x_left, safety_bits_x_right);

  // nothing is visible
  if (pre.len <= 0)
    return;

  if (useHalocolor) {
    if (isCenterChromaLoc)
      RenderYUY2<true, fadeBackground, ChromaLocationMode::CENTER_422>(color, halocolor, pitch, _dstp, pre);
    else
      RenderYUY2<true, fadeBackground, ChromaLocationMode::LEFT_422>(color, halocolor, pitch, _dstp, pre);
  }
  else {
    if (isCenterChromaLoc)
      RenderYUY2<false, fadeBackground, ChromaLocationMode::CENTER_422>(color, halocolor, pitch, _dstp, pre);
    else
      RenderYUY2<false, fadeBackground, ChromaLocationMode::LEFT_422>(color, halocolor, pitch, _dstp, pre);
  }
}

template<typename pixel_t, bool useHalocolor, bool fadeBackground, int rgbstep>
static void RenderPackedRGB(int color, int halocolor, BYTE* _dstp, int pitch, int height, PreRendered& pre)
{
  // packed: only 8 and 16 bits
  int bits_per_pixel = 0;
  if constexpr (sizeof(pixel_t) == 1)
    bits_per_pixel = 8;
  else if constexpr (sizeof(pixel_t) == 2)
    bits_per_pixel = 16;

  const int val_color_R = getHBDColor_RGB<pixel_t>(getColorForPlane(PLANAR_R, color), bits_per_pixel);
  const int val_color_R_outline = getHBDColor_RGB<pixel_t>(getColorForPlane(PLANAR_R, halocolor), bits_per_pixel);
  const int val_color_G = getHBDColor_RGB<pixel_t>(getColorForPlane(PLANAR_G, color), bits_per_pixel);
  const int val_color_G_outline = getHBDColor_RGB<pixel_t>(getColorForPlane(PLANAR_G, halocolor), bits_per_pixel);
  const int val_color_B = getHBDColor_RGB<pixel_t>(getColorForPlane(PLANAR_B, color), bits_per_pixel);
  const int val_color_B_outline = getHBDColor_RGB<pixel_t>(getColorForPlane(PLANAR_B, halocolor), bits_per_pixel);

  // upside down
  BYTE* dstp = _dstp + pre.x * rgbstep + (height - 1 - pre.y) * pitch;

  // Start rendering
  for (int ty = pre.ystart; ty < pre.yend; ty++) {
    uint8_t* dp = dstp;
    uint8_t* fontline_ptr = pre.stringbitmap[ty].data();
    [[maybe_unused]] uint8_t* fontoutline_ptr;
    if constexpr(useHalocolor)
      fontoutline_ptr = pre.stringbitmap_outline[ty].data();

    const int shifted_xstart = pre.safety_bits_x_left + pre.xstart; // though safety bit count must be 0 here
    for (int tx = shifted_xstart; tx < shifted_xstart + pre.text_width; tx++)
    {
      const bool lightIt = 0 != get_bit(fontline_ptr, tx);
      LightOnePixelPackedRGB<pixel_t, fadeBackground>(lightIt, dp, val_color_R, val_color_G, val_color_B);
      if constexpr(useHalocolor) {
        if (!lightIt) {
          const bool lightIt_outline = 0 != get_bit(fontoutline_ptr, tx);
          LightOnePixelPackedRGB<pixel_t, fadeBackground>(lightIt_outline, dp, val_color_R_outline, val_color_G_outline, val_color_B_outline);
        }
      }
      dp += rgbstep;
    }
    dstp -= pitch;
  }
}

// Draws an already remapped string onto a packed RGB surface.
// pixel_t / rgbstep select the component type and the byte distance between
// pixels, fadeBackground is forwarded to the renderer as a template argument.
// The runtime useHalocolor flag is turned into a compile-time argument here.
template<typename pixel_t, int rgbstep, bool fadeBackground>
static void do_DrawStringPackedRGB(
  const int width, const int height, BYTE* _dstp, int pitch,
  const BitmapFont* bmfont, int x, int y, std::vector<int>& s, int color, int halocolor, int align, bool useHalocolor)
{
  // no horizontal subsampling for packed RGB: no guard bits on either side
  const int no_safety_bits = 0;

  PreRendered pre(bmfont->font_bitmaps.data(), bmfont->fontline_bytes,
    width, height, x, y, s, align, useHalocolor,
    bmfont->width, bmfont->height,
    no_safety_bits, no_safety_bits);

  // nothing to draw
  if (pre.len <= 0)
    return;

  if (!useHalocolor) {
    RenderPackedRGB<pixel_t, false, fadeBackground, rgbstep>(color, halocolor, _dstp, pitch, height, pre);
    return;
  }

  RenderPackedRGB<pixel_t, true, fadeBackground, rgbstep>(color, halocolor, _dstp, pitch, height, pre);
}


// Case-insensitive equality of two strings.
// Returns true only when both strings have the same length and every pair of
// characters matches after tolower() (current C locale).
static bool strequals_i(const std::string& a, const std::string& b)
{
  return std::equal(a.begin(), a.end(),
    b.begin(), b.end(),
    // Characters are taken as unsigned char: passing a negative plain char
    // (any byte >= 0x80 where char is signed) to tolower() is undefined behaviour.
    [](unsigned char lhs, unsigned char rhs) {
      return tolower(lhs) == tolower(rhs);
    });
}

// in fixedfonts.cpp
// Tables of the built-in (predefined) fonts. GetBitmapFont reads all three with
// the same index in the range 0 .. PREDEFINED_FONT_COUNT-1, i.e. entry i of each
// table is used together to build one font.
extern const uint16_t *font_bitmaps[]; // per font: bitmap data, handed to BitmapFont as the "internal" bitmap
extern const uint16_t *font_codepoints[]; // per font: code point table handed to BitmapFont
extern const FixedFont_info_t *font_infos[]; // per font: descriptor (charcount, width, height, fontname, bold are read)

// Builds a BitmapFont around entry `index` of the predefined font tables
// (font_infos / font_bitmaps / font_codepoints from fixedfonts.cpp).
static std::unique_ptr<BitmapFont> CreatePredefinedBitmapFont(int index)
{
  const FixedFont_info_t* fi = font_infos[index];
  return std::unique_ptr<BitmapFont>(new BitmapFont(
    fi->charcount,
    font_bitmaps[index], // internal one
    nullptr, // not a byte array from external BDF
    sizeof(uint16_t), // sizeof(*font_bitmaps) 2: uint16_t
    font_codepoints[index],
    fi->width,
    fi->height,
    fi->fontname,
    "",
    fi->bold,
    false)); // debugSave is only honoured for external fonts
}

// Returns a font for the requested name/size/boldness, or nullptr when none is available.
//
// Lookup order:
//   1. predefined font with matching name (case-insensitive), boldness and exactly `size` height;
//   2. otherwise a predefined font with matching name and boldness: the first such entry is the
//      initial candidate, later entries replace it only when they are not taller than `size`
//      and are closer to `size` than the current candidate;
//   3. otherwise an external font loaded through LoadBMF(name, bold).
//
// size      - requested font height
// name      - font name; a null pointer is treated as "no such font" and yields nullptr
// bold      - requested boldness for predefined fonts, also forwarded to LoadBMF
// debugSave - forwarded to BitmapFont for externally loaded fonts only
std::unique_ptr<BitmapFont> GetBitmapFont(int size, const char *name, bool bold, bool debugSave) {

  // std::string cannot be constructed from a null pointer; report "not found" instead
  if (name == nullptr)
    return nullptr;

  // Single scan over the internal list: an exact hit returns immediately,
  // meanwhile the best fallback candidate is tracked for the case there is none.
  int last_good_size = 0;
  int found_index = -1;
  for (int i = 0; i < PREDEFINED_FONT_COUNT; i++)
  {
    const FixedFont_info_t* fi = font_infos[i];
    if (fi->bold != bold || !strequals_i(fi->fontname, name))
      continue;

    if (fi->height == size)
      return CreatePredefinedBitmapFont(i); // first exact match wins

    if (last_good_size == 0) {
      // first font of this family becomes the initial candidate, whatever its size
      found_index = i;
      last_good_size = fi->height;
    }
    else if (std::abs(fi->height - size) < std::abs(last_good_size - size) && fi->height <= size) {
      // has better size match and is not larger
      found_index = i;
      last_good_size = fi->height;
    }
  }

  // size does not match exactly: use the nearest candidate found above
  if (found_index >= 0)
    return CreatePredefinedBitmapFont(found_index);

  // not an internal font: try to load an external one
  // fixme: make cache
  BdfFont bdf = LoadBMF(name, bold);
  if (bdf.codepoints_array.empty())
    return nullptr;

  // NOTE(review): bdf is destroyed when this function returns, so BitmapFont presumably
  // copies the bitmap and code point data it is given here - confirm in its constructor.
  return std::unique_ptr<BitmapFont>(new BitmapFont(
    bdf.font_info.chars,
    nullptr, // not an internal one
    bdf.font_bitmaps.data(), // a byte array from external BDF
    bdf.font_info.fontline_bytes,
    bdf.codepoints_array.data(),
    bdf.font_info.font_bounding_box_x,
    bdf.font_info.font_bounding_box_y,
    bdf.font_info.font,
    bdf.font_filename,
    strequals_i(bdf.font_properties.Weight_name, "bold"),
    debugSave));
}

static void DrawString_internal(BitmapFont* current_font, const VideoInfo& vi, PVideoFrame& dst, int x, int y, std::string& s_utf8,
  int color, int halocolor, bool useHalocolor, int align, bool fadeBackground, int chromalocation)
{
  //static BitmapFont_10_20 infoFont1020; // constructor runs once, single instance

  // map an utf8 string to a sequence of character map indexes
  auto s_remapped = current_font->remap(s_utf8); // array of font table indexes

  //SaveBitmapSource(); // debug to generate source from original table

  const bool isRGB = vi.IsRGB();
  const int planes_y[4] = { PLANAR_Y, PLANAR_U, PLANAR_V, PLANAR_A };
  const int planes_r[4] = { PLANAR_G, PLANAR_B, PLANAR_R, PLANAR_A };
  const int* planes = isRGB ? planes_r : planes_y;

  int logXRatioUV = 0;
  int logYRatioUV = 0;
  if (!vi.IsY() && !vi.IsRGB()) {
    logXRatioUV = vi.IsYUY2() ? 1 : vi.GetPlaneWidthSubsampling(PLANAR_U);
    logYRatioUV = vi.IsYUY2() ? 0 : vi.GetPlaneHeightSubsampling(PLANAR_U);
  }
  const int planecount = vi.IsYUY2() ? 1 : std::min(vi.NumComponents(), 3);
  BYTE* dstps[3] = { nullptr };
  int pitches[3] = { 0 };

  for (int i = 0; i < planecount; i++)
  {
    int plane = planes[i];
    dstps[i] = dst->GetWritePtr(plane);
    pitches[i] = dst->GetPitch(plane);
  }

  const int width = vi.width;
  const int height = vi.height;

  const int bits_per_pixel = vi.BitsPerComponent();

  // narrow down valid chroma choices, ignoring and moving to default what is not supported at the moment
  if (vi.IsYV411()) {
    // ignored, always left
    chromalocation = ChromaLocation_e::AVS_CHROMA_LEFT;
  }
  else if (vi.Is420() || vi.Is422() || vi.IsYUY2()) {
    if (chromalocation != ChromaLocation_e::AVS_CHROMA_CENTER)
      chromalocation = ChromaLocation_e::AVS_CHROMA_LEFT;
    // When CENTER is specified, do "center", all other cases fall back 
    // to "left" (mpeg2). Option is meaningful only 420 or 422 formats, otherwise ignored.
  }

  // YUY2
  if (vi.IsYUY2()) {
    if (fadeBackground)
      do_DrawStringYUY2<true>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
    else
      do_DrawStringYUY2<false>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
    return;
  }

  // Packed RGB24/32/48/64
  if (isRGB && !vi.IsPlanar()) {
    if (fadeBackground) {
      if (vi.IsRGB24())
        do_DrawStringPackedRGB<uint8_t, 3, true>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor);
      else if (vi.IsRGB32())
        do_DrawStringPackedRGB<uint8_t, 4, true>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor);
      else if (vi.IsRGB48())
        do_DrawStringPackedRGB<uint16_t, 6, true>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor);
      else if (vi.IsRGB64())
        do_DrawStringPackedRGB<uint16_t, 8, true>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor);
    }
    else {
      if (vi.IsRGB24())
        do_DrawStringPackedRGB<uint8_t, 3, false>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor);
      else if (vi.IsRGB32())
        do_DrawStringPackedRGB<uint8_t, 4, false>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor);
      else if (vi.IsRGB48())
        do_DrawStringPackedRGB<uint16_t, 6, false>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor);
      else if (vi.IsRGB64())
        do_DrawStringPackedRGB<uint16_t, 8, false>(width, height, dstps[0], pitches[0], current_font, x, y, s_remapped, color, halocolor, align, useHalocolor);
    }
    return;
  }

  // planar and Y
  if (fadeBackground) {
    if (isRGB) {
      switch (bits_per_pixel)
      {
      case 8:
        do_DrawStringPlanar<uint8_t, true, true>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      case 10:
      case 12:
      case 14:
      case 16:
        do_DrawStringPlanar<uint16_t, true, true>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      case 32:
        do_DrawStringPlanar<float, true, true>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      }
    }
    else {
      switch (bits_per_pixel)
      {
      case 8:
        do_DrawStringPlanar<uint8_t, true, false>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      case 10:
      case 12:
      case 14:
      case 16:
        do_DrawStringPlanar<uint16_t, true, false>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      case 32:
        do_DrawStringPlanar<float, true, false>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      }
    }
  }
  else {
    if (isRGB) {
      switch (bits_per_pixel)
      {
      case 8:
        do_DrawStringPlanar<uint8_t, false, true>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      case 10:
      case 12:
      case 14:
      case 16:
        do_DrawStringPlanar<uint16_t, false, true>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      case 32:
        do_DrawStringPlanar<float, false, true>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      }
    }
    else {
      switch (bits_per_pixel)
      {
      case 8:
        do_DrawStringPlanar<uint8_t, false, false>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      case 10:
      case 12:
      case 14:
      case 16:
        do_DrawStringPlanar<uint16_t, false, false>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      case 32:
        do_DrawStringPlanar<float, false, false>(width, height, dstps, pitches, logXRatioUV, logYRatioUV, planecount, bits_per_pixel, current_font, x, y, s_remapped, color, halocolor, align, useHalocolor, chromalocation);
        break;
      }
    }
  }
}

// Draws a single line of utf8 text with the given font onto the frame.
// Thin public wrapper around DrawString_internal; every argument is passed on as received.
//   real_x, real_y          - text position, interpreted together with align
//   fadeBackground          - forwarded to the renderer
//   textcolor, halocolor    - text color and outline color
//   useHaloColor            - whether halocolor is used at all
//   chromalocation          - forwarded to the renderer
void SimpleTextOutW(BitmapFont *current_font, const VideoInfo& vi, PVideoFrame& frame, int real_x, int real_y, std::string& text_utf8,
  bool fadeBackground, int textcolor, int halocolor, bool useHaloColor, int align, int chromalocation)
{
  DrawString_internal(current_font, vi, frame, real_x, real_y, text_utf8, textcolor, halocolor, useHaloColor, align, fadeBackground, chromalocation); // note: parameter order differs, fadeBackground comes after align there
}

// Multi-line variant of SimpleTextOutW: the text is split at '\n' and every line
// is drawn with SimpleTextOutW, one below the other.
// additional parameter: lsp line spacing, units are in 1/8 pixels by definition
//
// A trailing '\n' followed by nothing does not produce an extra empty line:
// "Line1\nLine2" is the same as "Line1\nLine2\n". Like in SubTitle.
// For an empty text nothing is drawn.
void SimpleTextOutW_multi(BitmapFont *current_font, const VideoInfo& vi, PVideoFrame& frame, int real_x, int real_y, std::string& text_utf8,
  bool fadeBackground, int textcolor, int halocolor, bool useHaloColor,
  int align, int lsp, int chromalocation)
{
  // make list governed by LF separator
  std::vector<std::string> parts;
  {
    std::string line;
    std::stringstream ss(text_utf8);
    while (std::getline(ss, line, '\n'))
      parts.push_back(line); // still in utf8
  }

  const int fontSize = current_font->height;

  // number of line advances between the first and the last line
  const int extra_lines = static_cast<int>(parts.size()) - 1;
  // distance between two consecutive lines, in pixels
  const double line_advance = fontSize + lsp / 8.0;

  // when multiline, bottom and vertically centered cases affect starting y
  const int al = alignToBitmask(align);
  if (al & ATA_BOTTOM)
    real_y -= static_cast<int>(line_advance * extra_lines + 0.5);
  else if (al & ATA_BASELINE)
    real_y -= static_cast<int>(line_advance * extra_lines / 2.0 + 0.5);

  const int orig_real_y = real_y;
  int linecount = 0;
  // by reference: SimpleTextOutW takes a non-const std::string&, no per-line copy is needed
  for (auto& s_utf8 : parts) {
    real_y = orig_real_y + fontSize * linecount + static_cast<int>(lsp / 8.0 * linecount + 0.5);
    SimpleTextOutW(current_font, vi, frame, real_x, real_y, s_utf8, fadeBackground, textcolor, halocolor, useHaloColor, align, chromalocation);
    linecount++;
  }
}

// Old legacy info.h function, but with utf8 mode.
// Draws without outline and with background fading; input was originally ASCII.
// Despite the name Planar, it works for all formats.
// Uses the 10x20 "info_h" font; draws nothing when that font cannot be obtained.
void DrawStringPlanar(VideoInfo& vi, PVideoFrame& dst, int x, int y, const char* s)
{
  // fixed text color, packed as three 8 bit components
  const int color = vi.IsRGB()
    ? (250 << 16) + (250 << 8) + (250)
    : (230 << 16) + (128 << 8) + (128);

  std::string s_utf8 = charToUtf8(s, false);

  std::unique_ptr<BitmapFont> current_font = GetBitmapFont(20, "info_h", false, false); // 10x20
  if (!current_font)
    return;

  const int halocolor = 0;
  const bool useHalocolor = false; // don't use halocolor
  const int align = 0; // no align
  // fadeBackground = true: background letter area is faded instead of being left untouched.
  const bool fadeBackground = true;

  DrawString_internal(current_font.get(), vi, dst, x, y, s_utf8,
    color, halocolor, useHalocolor, align, fadeBackground,
    ChromaLocation_e::AVS_CHROMA_LEFT);
}

// Legacy entry point kept under its old name.
// Only forwards all arguments unchanged to DrawStringPlanar.
void DrawStringYUY2(VideoInfo& vi, PVideoFrame& dst, int x, int y, const char* s)
{
  DrawStringPlanar(vi, dst, x, y, s);
}

// legacy function w/o outline, originally with ASCII input, background fading
// Only forwards all arguments unchanged to DrawStringPlanar.
void DrawStringRGB24(VideoInfo &vi, PVideoFrame& dst, int x, int y, const char* s)
{
  DrawStringPlanar(vi, dst, x, y, s);
}

// legacy function w/o outline, originally with ASCII input, background fading
// Only forwards all arguments unchanged to DrawStringPlanar.
void DrawStringRGB32(VideoInfo& vi, PVideoFrame& dst, int x, int y, const char* s)
{
  DrawStringPlanar(vi, dst, x, y, s);
}
